####### REPLICATION MATERIALS FOR "RACIAL INEQUALITY IN WAR" #######

# Load packages
library(MASS)
library(dplyr)
library(tidyr)
library(stargazer)
library(ggplot2)
library(patchwork)
library(readr)

# Set working directory, pointing to the main replication folder
setwd("set/path/to/working/directory")

#### Import data ####

## Data on casualties at the unit level
data_all = read_csv("./data/wwi_unit_period_data.csv")

## Data on unit types
unit_types = read_csv("./data/wwi_unit_type_data.csv")


# Create key data subsets
data_core_np = data_all |> filter(core==1 & pioninf==0) # Core of war, no pioneer infantry units
data_core_p = data_all |> filter(core==1 & pioninf==1)  # Core of war, only pioneer infantry units
data_post_np = data_all |> filter(post==1 & pioninf==0) # Post-armistice, no pioneer infantry units
data_post_p = data_all |> filter(post==1 & pioninf==1)  # Post-armistice, only pioneer infantry units

# Time period labels
periodskey = data_all |> distinct(period, periodct)     # Formatted as YYMMHH, where HH is the half-month


#### Basic descriptives ####

# Total relevant fatalities in entire dataset
sum(data_all$dead_spec) # 44,914

# Fatalities by race, raw
sum(data_all$dead_spec[which(data_all$black==0)]) # White: 43,367
sum(data_all$dead_spec[which(data_all$black==1)]) # Black: 1,547

# Fatality rates by race, as proportion of all casualties
sum(data_all$dead_spec[which(data_all$black==0)])/sum(data_all$dead_spec) # White: 96.6%
sum(data_all$dead_spec[which(data_all$black==1)])/sum(data_all$dead_spec) # Black: 3.4%

# Fatality rates by race, as proportion of all troops by race (total troop estimates in denominators come from Keene 2002)
sum(data_all$dead_spec[which(data_all$black==0)])/1800000 # White: 2.4%
sum(data_all$dead_spec[which(data_all$black==1)])/200000  # Black: 0.8%

# Number of observations
nrow(data_core_np) # 1,770

# Number of units by type and race
length(unique(data_core_np$unit))                        # 168 total infantry units
length(unique(data_core_np$unit[data_core_np$black==1])) # 8 Black infantry units
length(unique(data_core_np$unit[data_core_np$black==0])) # 160 White infantry units

# Number of pre-armistice fatalities (excluding pioneer infantry)
sum(data_core_np$dead_spec) # 41,148




#### RESULT 1: Black Soldiers Disproportionately Assigned to Support Units ####

# Personnel totals, but only focusing on roles where there were also Black soldiers
unit_types_sub = unit_types |> 
  filter(subunit!="Brigade Headquarters") |> # Drop the headquarters units (small #s and low racial alignment in any case)
  filter(!is.na(black_personnel_overseas)) |>
  mutate(all_personnel_overseas1000s = black_personnel_overseas1000s + white_personnel_overseas1000s,
         id2 = gsub(" / ", "\n", id),
         id2 = gsub("Q. M. C. Special and Technical Units\n|M. T. C. Special and Technical Units\n", "", id2)) |>
  arrange(combatrole, all_personnel_overseas1000s) |>
  mutate(order = row_number())

# Make data long for plotting
unit_types_sub = unit_types_sub |>
  pivot_longer(cols=c("white_personnel_overseas1000s", "black_personnel_overseas1000s"), names_to="race", values_to = "total")

# Create variables for unit type and race
unit_types_sub = unit_types_sub |>
  mutate(class = ifelse(race=="white_personnel_overseas1000s" & combatrole=="combat", "White Combat",
                        ifelse(race=="white_personnel_overseas1000s" & combatrole=="support", "White Support",
                               ifelse(race=="black_personnel_overseas1000s" & combatrole=="combat", "Black Combat",
                                      "Black Support"))),
         class = factor(class, levels=c("White Combat", "Black Combat", "White Support", "Black Support")),
         race = ifelse(class %in% c("White Combat", "White Support"), "White", "Black"),
         race = factor(race, levels=c("White", "Black")))


##### Figure 1: Black personnel disproportionately assigned to support roles #####
ggplot(unit_types_sub, aes(reorder(id2, order), total)) + 
  geom_bar(aes(fill=class), stat="identity") +
  theme_bw() + 
  scale_fill_manual("", values=c("White Combat" = "tomato", "White Support" = "skyblue1", "Black Combat" = "red4", "Black Support" = "dodgerblue4")) + 
  theme(axis.text.x = element_text(angle=60, vjust=1, hjust=1, size=6)) + 
  theme(panel.grid.major.x = element_line(linewidth=0.1), axis.ticks = element_blank()) + 
  xlab("Unit Type") +
  ylab("Estimated Prescribed\nPersonnel Deployed (1,000s)") +
  coord_cartesian(xlim=c(0,16))
ggsave("./figures/Fig1.pdf", width=7, height=5)


##### Figure 2: Personnel assignment to combat and support roles, aggregating across occupational specialties #####
# Breakdown of combat vs. support for each race
cs_race = ggplot(unit_types_sub, aes(race, total, fill=class)) + 
  geom_bar(aes(fill=class), stat="identity") + 
  scale_fill_manual("Unit Type", values=c("White Combat" = "tomato", "White Support" = "skyblue1", "Black Combat" = "red4", "Black Support" = "dodgerblue4")) + 
  scale_y_continuous("Estimated Prescribed\nPersonnel Deployed (1,000s)", breaks = c(0,250,500,750,1000,1250)) +
  scale_x_discrete("") +
  coord_cartesian(ylim=c(0, 1300)) +
  guides(fill = guide_legend(nrow = 2, byrow = T)) +
  theme_bw()

# Breakdown of White vs. Black for each unit type
wb_type = ggplot(unit_types_sub, aes(combatrole, total, fill=class)) + 
  geom_bar(aes(fill=class), stat="identity") + 
  scale_fill_manual("Unit Type", values=c("White Combat" = "tomato", "White Support" = "skyblue1", "Black Combat" = "red4", "Black Support" = "dodgerblue4")) + 
  scale_y_continuous("Estimated Prescribed\nPersonnel Deployed (1,000s)", breaks = c(0,250,500,750,1000,1250)) +
  scale_x_discrete("", labels=c("Combat", "Support")) + 
  coord_cartesian(ylim=c(0, 1300)) +
  guides(fill = guide_legend(nrow = 2, byrow = T)) +
  theme_bw() 

# Put both plots together
(cs_race | wb_type) + plot_layout(guides = "collect", axes = "collect") & theme(legend.position = "bottom", 
                                                                                legend.box.margin = margin(-15, unit="pt"))
ggsave("./figures/Fig2.pdf", width=4.5, height=4)


#### RESULT 2: White Soldiers Die More when Returns to Combat Effectiveness are High ####

# Some of the highest period-level fatalities in units (two exceed 350)
rev(sort(data_core_np$dead_spec))[1:10]      # Raw
log(rev(sort(data_core_np$dead_spec))[1:10]) # Logged

# Calculate per-period average fatalities by race
pers = sort(unique(data_all$period)) # Get all periods 
peravg = matrix(NA, nrow=length(unique(data_all$period)), ncol=12) # Empty matrix for results

for (i in 1:length(unique(data_all$period))) {
  # Get data from one period
  grab = data_all[data_all$period==pers[i],]
  
  # Period label 
  peravg[i,1] = pers[i]
  
  # Whether period is in core phase of war
  peravg[i,2] = grab$core[1]
  
  # Whether period is in post-armistice phase of war 
  peravg[i,3] = grab$post[1]	
  
  # Average overall, Black, and White deaths
  peravg[i,4] = ifelse(length(grab$dead_spec)>0, mean(grab$dead_spec, na.rm=TRUE), NA)
  peravg[i,5] = ifelse(length(grab$dead_spec[grab$black==1])>0, mean(grab$dead_spec[grab$black==1], na.rm=TRUE), NA)
  peravg[i,6] = ifelse(length(grab$dead_spec[grab$black==0])>0, mean(grab$dead_spec[grab$black==0], na.rm=TRUE), NA)
  
  # Average overall, Black, and White infantry deaths
  peravg[i,7] = ifelse(length(grab$dead_spec[grab$pioninf==0])>0, mean(grab$dead_spec[grab$pioninf==0], na.rm=TRUE), NA)
  peravg[i,8] = ifelse(length(grab$dead_spec[grab$black==1 & grab$pioninf==0])>0, mean(grab$dead_spec[grab$black==1 & grab$pioninf==0], na.rm=TRUE), NA)
  peravg[i,9] = ifelse(length(grab$dead_spec[grab$black==0 & grab$pioninf==0])>0, mean(grab$dead_spec[grab$black==0 & grab$pioninf==0], na.rm=TRUE), NA)
  
  # Average overall, Black, and White pioneer infantry deaths
  peravg[i,10] = ifelse(length(grab$dead_spec[grab$pioninf==1])>0, mean(grab$dead_spec[grab$pioninf==1], na.rm=TRUE), NA)
  peravg[i,11] = ifelse(length(grab$dead_spec[grab$black==1 & grab$pioninf==1])>0, mean(grab$dead_spec[grab$black==1 & grab$pioninf==1], na.rm=TRUE), NA)
  peravg[i,12] = ifelse(length(grab$dead_spec[grab$black==0 & grab$pioninf==1])>0, mean(grab$dead_spec[grab$black==0 & grab$pioninf==1], na.rm=TRUE), NA)		
}
pavg = as.data.frame(peravg)
colnames(pavg) = c("period","core","post","dead_spec","dead_spec_b","dead_spec_w","inf_dead_spec","inf_dead_spec_b","inf_dead_spec_w","pion_dead_spec","pion_dead_spec_b","pion_dead_spec_w")

# Merge in the necessary numeric period labels 
pavg = merge(pavg, periodskey, by="period")

# Unit-level White fatalities
b0c1p0_period = data_core_np$periodct[data_core_np$black==0]
b0c1p0_dead = data_core_np$dead_spec[data_core_np$black==0]
b0c1p0 = data.frame(periodct=b0c1p0_period, dead_spec=b0c1p0_dead, type="White")

# Unit-level Black fatalities
b1c1p0_period = data_core_np$periodct[data_core_np$black==1]
b1c1p0_dead = data_core_np$dead_spec[data_core_np$black==1]
b1c1p0 = data.frame(periodct=b1c1p0_period, dead_spec=b1c1p0_dead, type="Black")

# Long data of unit-level White and Black fatalities
combat_deaths_unit_data = rbind(b0c1p0, b1c1p0)

# Average White fatalities per period
pavg_white_ct = pavg$periodct[pavg$core==1]
pavg_white_dead = pavg$inf_dead_spec_w[pavg$core==1]
pavg_white = data.frame(periodct=pavg_white_ct, dead_spec=pavg_white_dead, type="White")

# Average Black fatalities per period
pavg_black_ct = pavg$periodct[pavg$core==1]
pavg_black_dead = pavg$inf_dead_spec_b[pavg$core==1]
pavg_black = data.frame(periodct=pavg_black_ct, dead_spec=pavg_black_dead, type="Black")

# Long data of average White and Black fatalities per period
combat_deaths_avg_data = rbind(pavg_white, pavg_black)


##### Figure 3: US combat fatalities by regular infantry regiment #####

# Create date labels for the plots
datelabels = periodskey$period[seq(2, 22, by=8)]
dateyears = paste0("19", substr(datelabels, 1, 2))
datemonths = substr(datelabels, 3, 4)
datehalfs = substr(datelabels, 6, 6)
datelabels_core = paste(dateyears, datemonths, datehalfs, sep="-")

# Create the figure
combat_deaths_unit_data |> filter(periodct <= 32) |>
  ggplot(aes(periodct, log(dead_spec+1))) + geom_point(aes(color=type), alpha=0.15) +
  theme_bw() + scale_color_manual("Regiment", values=c("red4","dodgerblue1")) +
  ylab("Unit Fatalities (Logged)") + xlab("Period (Year-Month-Half)") +
  scale_x_continuous(breaks=seq(11, 32, 8), labels=datelabels_core,
                     minor_breaks = seq(10, 32, 1)) +
  geom_line(data=combat_deaths_avg_data, aes(periodct, log(dead_spec+1), color=type), linewidth=1) +
  facet_grid(cols = vars(type)) + coord_cartesian(ylim=c(0, 6)) +
  theme(legend.position = "none", panel.spacing = unit(0.75, "lines"))
ggsave("./figures/Fig3.pdf", width=5, height=3.75)

# Descriptive statistic on average per-unit fatalities during main part of conflict, by race
data_core_np |> group_by(black) |> summarize(mean(dead_spec)) # 24.3 for White, 6.67 for Black


##### Table 2: Pre-armistice fatalities by race #####

# Bivariate model
regmain = lm(dead_spec ~ black, data=data_core_np)

# Add period fixed effects
regmainFE = lm(dead_spec ~ black + as.factor(period), data=data_core_np)

# Final table of results
stargazer(regmain, regmainFE, 
          align=F, style="apsr", 
          omit.stat=c("f","ser","adj.rsq","rsq"),
          digits=2, 
          covariate.labels=c("Black Regiment"),
          omit=c("period"))


##### Comparisons of White and Black units of Meuse-Argonne offensive #####

# Get relevant observations for I Corps
meuse_arg = data_all |> filter(period %in% c("180902", "181001") &
                                 unit %in% c("365 Inf", "366 Inf", "367 Inf", "368 Inf",
                                             "305 Inf", "306 Inf", "307 Inf", "308 Inf",
                                             "109 Inf", "110 Inf", "111 Inf", "112 Inf",
                                             "137 Inf", "138 Inf", "139 Inf", "140 Inf",
                                             "325 Inf", "326 Inf", "327 Inf", "328 Inf"))

# Average regiment-level deaths in first period of the offensive (last half of September 1918)
round(mean(meuse_arg$dead_spec[meuse_arg$black==1 & meuse_arg$period==180902]), 1) # Black: 9.0
round(mean(meuse_arg$dead_spec[meuse_arg$black==0 & meuse_arg$period==180902]), 1) # White: 82.2

# Average regiment-level deaths in second period of the offensive (first half of October 1918)
round(mean(meuse_arg$dead_spec[meuse_arg$black==1 & meuse_arg$period==181001]), 1) # Black: 2.8
round(mean(meuse_arg$dead_spec[meuse_arg$black==0 & meuse_arg$period==181001]), 1) # White: 117.6



#### RESULT 3: Black Soldiers Die from Accidents and Disease at Higher Rates ####

# Calculate per-period average fatalities by race for post-armistice, non-combat deaths
peravgnc = matrix(NA, nrow=length(unique(data_all$period)), ncol=6) # Empty matrix for results

for (i in 1:length(unique(data_all$period))) {
  # Get data from one period
  grab = data_all[data_all$period==pers[i],]
  
  # Period label
  peravgnc[i,1] = pers[i]
  
  # Whether period is in core phase of war
  peravgnc[i,2] = grab$core[1]
  
  # Whether period is in post-armstice phase of war
  peravgnc[i,3] = grab$post[1]	
  
  # Average overall, Black, and White non-combat deaths
  peravgnc[i,4] = ifelse(length(grab$dead_spec[grab$pioninf==0])>0, mean(grab$noncombat_death[grab$pioninf==0], na.rm=TRUE), NA)
  peravgnc[i,5] = ifelse(length(grab$dead_spec[grab$black==1 & grab$pioninf==0])>0, mean(grab$noncombat_death[grab$black==1 & grab$pioninf==0], na.rm=TRUE), NA)
  peravgnc[i,6] = ifelse(length(grab$dead_spec[grab$black==0 & grab$pioninf==0])>0, mean(grab$noncombat_death[grab$black==0 & grab$pioninf==0], na.rm=TRUE), NA)
}
pavgnc = as.data.frame(peravgnc)
colnames(pavgnc) = c("period","core","post","noncombat_dead","noncombat_dead_b","noncombat_dead_w")

# Merge in the necessary numeric period labels 
pavgnc = merge(pavgnc, periodskey, by="period")

# Unit-level White non-combat fatalities
b0p1_ct = data_post_np$periodct[data_post_np$black==0]
b0p1_dead = data_post_np$noncombat_death[data_post_np$black==0]
b0p1 = data.frame(periodct=b0p1_ct, noncombat_dead=b0p1_dead, type="White")

# Unit-level Black non-combat fatalities
b1p1_ct = data_post_np$periodct[data_post_np$black==1]
b1p1_dead = data_post_np$noncombat_death[data_post_np$black==1]
b1p1 = data.frame(periodct=b1p1_ct, noncombat_dead=b1p1_dead, type="Black")

# Long data of unit-level White and Black non-combat fatalities
noncombat_deaths_unit_data = rbind(b0p1, b1p1)

# Average White non-combat fatalities per period
pavgnc_white_ct = pavgnc$periodct[pavgnc$post==1]
pavgnc_white_dead = pavgnc$noncombat_dead_w[pavgnc$post==1]
pavgnc_white = data.frame(periodct=pavgnc_white_ct, noncombat_dead=pavgnc_white_dead, type="White")

# Average Black non-combat fatalities per period
pavgnc_black_ct = pavgnc$periodct[pavgnc$post==1]
pavgnc_black_dead = pavgnc$noncombat_dead_b[pavgnc$post==1]
pavgnc_black = data.frame(periodct=pavgnc_black_ct, noncombat_dead=pavgnc_black_dead, type="Black")

# Long data of average White and Black non-combat fatalities per period
noncombat_deaths_avg_data = rbind(pavgnc_white, pavgnc_black)


##### Figure 4: Post-armistice non-combat US fatalities by regular infantry regiment #####

# Create date labels for the plots
datelabels2 = periodskey$period[seq(22, 47, by=14)]
dateyears2 = paste0("19", substr(datelabels2, 1, 2))
datemonths2 = substr(datelabels2, 3, 4)
datehalfs2 = substr(datelabels2, 6, 6)
datelabels_post = paste(dateyears2, datemonths2, datehalfs2, sep="-")

# Create the figure
noncombat_deaths_unit_data |>
  ggplot(aes(periodct, log(noncombat_dead+1))) + 
  geom_point(aes(color=type, alpha=type)) + theme_bw() + 
  scale_color_manual("Regiment", values=c("red4","dodgerblue1")) + 
  ylab("Unit Fatalities (Logged)") + xlab("Period (Year-Month-Half)") + 
  scale_x_continuous(breaks=seq(31, 55, 14), labels=datelabels_post, minor_breaks = seq(31, 55, 1)) + 
  geom_line(data=noncombat_deaths_avg_data, aes(periodct, log(noncombat_dead+1), color=type), linewidth=1) + 
  coord_cartesian(xlim=c(32, 54), ylim=c(0, 3.1)) +
  facet_grid(cols = vars(type)) +
  scale_alpha_manual(values=c(0.3, 0.08)) +
  theme(legend.position = "none")
ggsave("./figures/Fig4.pdf", width=5, height=3.75)


##### Table 3: Post-armistice non-combat fatalities by race #####

# Bivariate model 
regpostnc = lm(noncombat_death ~ black, data=data_post_np)

# Add period fixed effects
regpostFEnc = lm(noncombat_death ~ black + as.factor(period), data=data_post_np)

# Final table of results
stargazer(regpostnc, regpostFEnc,
          align=F, style="apsr",
          omit.stat=c("f","ser","adj.rsq","rsq"), 
          digits=2,
          covariate.labels=c("Black Regiment"),
          omit=c("period"),
          column.sep.width = "10pt")


# Descriptive statistics on average regiment-level non-combat deaths during post-armistice phase of war
round(mean(data_post_np$noncombat_death[data_post_np$black==1 & data_post_np$period<190202], na.rm=TRUE), 1) # White: 2.2
round(mean(data_post_np$noncombat_death[data_post_np$black==0 & data_post_np$period<190202], na.rm=TRUE), 1) # Black: 1.4

round(mean(data_post_np$noncombat_death[data_post_np$black==1 & data_post_np$period<190202], na.rm=TRUE)/
        mean(data_post_np$noncombat_death[data_post_np$black==0 & data_post_np$period<190202], na.rm=TRUE)-1, 3) # 53.7% higher deaths in Black units



##### Figure 5: Non-combat deaths as a proportion of all deaths with a known cause during the pre-armistice portion of the war #####

# Proportion of Black pre-armistice regular infantry deaths from non-combat causes
brnc = sum(data_core_np$noncombat_death[data_core_np$black==1])/
  (sum(data_core_np$dead_total[data_core_np$black==1])-sum(data_core_np$cause_unknown[data_core_np$black==1]))
brnc  # 0.17

# Proportion of White pre-armistice regular infantry deaths from non-combat causes
wrnc = sum(data_core_np$noncombat_death[data_core_np$black==0])/
  (sum(data_core_np$dead_total[data_core_np$black==0])-sum(data_core_np$cause_unknown[data_core_np$black==0])) 
wrnc  # 0.11

# Proportion of Black pre-armistice *pioneer* infantry deaths from non-combat causes
bpnc = sum(data_core_p$noncombat_death[data_core_p$black==1])/
  (sum(data_core_p$dead_total[data_core_p$black==1])-sum(data_core_p$cause_unknown[data_core_p$black==1]))
bpnc  # 0.95

# Proportion of White pre-armistice *pioneer* infantry deaths from non-combat causes
wpnc = sum(data_core_p$noncombat_death[data_core_p$black==0])/
  (sum(data_core_p$dead_total[data_core_p$black==0])-sum(data_core_p$cause_unknown[data_core_p$black==0])) 
wpnc  # 0.88

# Put individual-level non-combat death data together
ncd = c(brnc, bpnc, wrnc, wpnc) 
nc = data.frame(values=ncd,
                race=c(rep("Black", 2), rep("White", 2)),
                type=rep(c("Infantry", "Pioneer"), 2))

# Create plot
ggplot(nc) + 
  geom_bar(aes(fill=race, x=type, y=values), stat="identity", position="dodge", width=0.7) + 
  theme_bw() + scale_fill_manual("Race", values = c("Black" = "red4", "White" = "dodgerblue1")) + xlab("") + 
  ylab("Proportion Non-Combat Deaths") + scale_y_continuous(breaks=c(seq(0,1, by=0.2))) 
ggsave("./figures/Fig5.pdf", width=4.75, height=3.25)

# How many cases have identified known cause of death?
1 - sum(data_all$cause_unknown)/sum(data_all$dead_total) # 0.967 



##### Table 4: Cause of death by race and unit type #####

# Regular infantry pre-armistice non-combat deaths: unit of analysis is all deaths
blacknc = sum(data_core_np$noncombat_death[data_core_np$black==1])
blackc = sum(data_core_np$dead_total[data_core_np$black==1])-sum(data_core_np$cause_unknown[data_core_np$black==1]) - blacknc
regblacknc = c(rep(1, blacknc), rep(0, blackc))

whitenc = sum(data_core_np$noncombat_death[data_core_np$black==0])
whitec = sum(data_core_np$dead_total[data_core_np$black==0])-sum(data_core_np$cause_unknown[data_core_np$black==0]) - whitenc
regwhitenc = c(rep(1, whitenc), rep(0, whitec))

# t-test of the difference
t.test(regblacknc, regwhitenc) # p < 0.01

# Pioneer infantry pre-armistice non-combat deaths: unit of analysis is all deaths
blackncp = sum(data_core_p$noncombat_death[data_core_p$black==1])
blackcp = sum(data_core_p$dead_total[data_core_p$black==1])-sum(data_core_p$cause_unknown[data_core_p$black==1]) - blackncp
regblackncp = c(rep(1, blackncp), rep(0, blackcp))

whitencp = sum(data_core_p$noncombat_death[data_core_p$black==0])
whitecp = sum(data_core_p$dead_total[data_core_p$black==0])-sum(data_core_p$cause_unknown[data_core_p$black==0]) - whitencp
regwhitencp = c(rep(1, whitencp), rep(0, whitecp))

# t-test of the difference
t.test(regblackncp, regwhitencp) # p < 0.01

# Regression for regular infantry, pre-armistice
regblackncdf = data.frame(cbind(rep(1, length(regblacknc)), regblacknc))
colnames(regblackncdf) = c("black","noncombat")
regwhitencdf = data.frame(cbind(rep(0, length(regwhitenc)), regwhitenc))
colnames(regwhitencdf) = c("black","noncombat")
regncdf = rbind(regblackncdf, regwhitencdf)

regnc = lm(noncombat ~ black, regncdf)
summary(regnc)

# Regression for pioneer infantry, pre-armistice
regblackncdfp = data.frame(cbind(rep(1, length(regblackncp)), regblackncp))
colnames(regblackncdfp) = c("black","noncombat")
regwhitencdfp = data.frame(cbind(rep(0, length(regwhitencp)), regwhitencp))
colnames(regwhitencdfp) = c("black","noncombat")
regncdfp = rbind(regblackncdfp, regwhitencdfp)

pionnc = lm(noncombat ~ black, regncdfp)
summary(pionnc)

# Produce Table 5
stargazer(regnc, pionnc, 
          align=F, style="apsr",
          omit.stat=c("f","ser","adj.rsq","rsq"),
          digits=2, covariate.labels=c("Black"))




#### APPENDIX MATERIALS ####

##### Appendix 1: Descriptive Statistics #####

###### Table A1: Summary Statistics: Regiment-Half Month Unit of Analysis ######

# Pre-armistice
keep = c("black","dead_spec","dead_total","combat_death","noncombat_death","core","pioninf")
temp = data_all[c(keep)]
temp_pre = temp |> filter(core==1 & pioninf==0) |> dplyr::select(-c("core", "pioninf"))

var_nobs = apply(temp_pre, 2, function(x) {sum(!is.na(x))})
var_means_pre = round(apply(temp_pre, 2, mean), 2)
var_stds_pre = round(apply(temp_pre, 2, sd), 2)
var_mins_pre = apply(temp_pre, 2, min)
var_maxs_pre = apply(temp_pre, 2, max)

xtable::xtable(data.frame(var_nobs, var_means_pre, var_stds_pre, var_mins_pre, var_maxs_pre), digits=2)

# Post-armistice
keep = c("black","noncombat_death","post","pioninf")
temp = data_all[c(keep)]
temp_post = temp |> filter(post==1 & pioninf==0) |> dplyr::select(-c("post", "pioninf"))

var_nobs = apply(temp_post, 2, function(x) {sum(!is.na(x))})
var_means_post = round(apply(temp_post, 2, mean), 2)
var_stds_post = round(apply(temp_post, 2, sd), 2)
var_mins_post = apply(temp_post, 2, min)
var_maxs_post = apply(temp_post, 2, max)

xtable::xtable(data.frame(var_nobs, var_means_post, var_stds_post, var_mins_post, var_maxs_post), digits=2)


##### Appendix 2: Calculations for Occupational Assignment Results #####

# A single plot with all deployments, raw numbers
unit_types_3 = unit_types %>% arrange(combatrole, n_overseas)
unit_types_3$order = 1:nrow(unit_types_3)

unit_types_3 = unit_types_3 |>
  pivot_longer(cols=c("n_white_overseas", "n_black_overseas"), names_to="race", values_to = "total")
unit_types_3$total[which(is.na(unit_types_3$total))] = 0

unit_types_3 = unit_types_3 |>
  mutate(class = ifelse(race=="n_white_overseas" & combatrole=="combat", "White Combat",
                        ifelse(race=="n_white_overseas" & combatrole=="support", "White Support",
                               ifelse(race=="n_black_overseas" & combatrole=="combat", "Black Combat",
                                      "Black Support"))),
         class = factor(class, levels=c("White Combat", "Black Combat", "White Support", "Black Support")),
         race = ifelse(class %in% c("White Combat", "White Support"), "White", "Black"),
         id2 = gsub(" / ", "\n", id),
         id2 = gsub("Q. M. C. Special and Technical Units\n|M. T. C. Special and Technical Units\n", "", id2))


###### Figure A1: Number of deployed units by occupational specialty, conditional on that specialty including at least one Black-designed unit ######

# For the figure, only focus on roles where there were also Black soldiers
unit_types_3 |> filter(black_personnel_overseas > 0) |>
  ggplot(aes(reorder(id2, order), total)) +
  geom_bar(aes(fill=class), stat="identity") +
  theme_bw() +
  scale_fill_manual("", values=c("White Combat" = "tomato", "White Support" = "skyblue1", "Black Combat" = "red4", "Black Support" = "dodgerblue4")) +
  theme(axis.text.x = element_text(angle=60, vjust=1, hjust=1, size=6)) +
  theme(panel.grid.major.x = element_line(linewidth=0.1), axis.ticks = element_blank()) +
  xlab("Unit Type") +
  ylab("Units Deployed") +
  coord_cartesian(xlim=c(0,19))
ggsave("./figures/FigA1.pdf", width=7, height=5)

###### Figure A2: Number of deployed units by occupational specialty, including those with no Black-designated units ######
ggplot(unit_types_3, aes(reorder(id, order), total)) + 
  geom_bar(aes(fill=class), stat="identity") +
  theme_bw() + 
  scale_fill_manual("", values=c("White Combat" = "tomato", "White Support" = "skyblue", "Black Combat" = "red4", "Black Support" = "steelblue4")) + 
  theme(axis.text.x=element_blank(), panel.grid.major.x = element_line(linewidth=0.1), axis.ticks = element_blank()) + 
  xlab("Unit Type") +
  ylab("Units Deployed") +
  coord_cartesian(xlim=c(0,167))
ggsave("./figures/FigA2.pdf", width=7, height=4)


##### Appendix 3: Raw Counts of Unit-Level Casualties Over Time #####


###### Figure A3: Raw counts of US combat fatalities by regular infantry regiment ######
combat_deaths_unit_data |> filter(periodct <= 32) |>
  ggplot(aes(periodct, dead_spec)) + geom_point(aes(color=type), alpha=0.15) +
  theme_bw() + scale_color_manual("Regiment", values=c("red4","dodgerblue1")) +
  ylab("Unit Fatalities") + xlab("Period (Year-Month-Half)") +
  scale_x_continuous(breaks=seq(11, 32, 8), labels=datelabels_core,
                     minor_breaks = seq(10, 32, 1)) +
  geom_line(data=combat_deaths_avg_data, aes(periodct, dead_spec, color=type), linewidth=1) +
  facet_grid(cols = vars(type)) + coord_cartesian(ylim=c(0, 400)) +
  theme(legend.position = "none", panel.spacing = unit(0.75, "lines"))
ggsave("./figures/FigA3.pdf", width=5, height=3.75)


###### Figure A4: Raw counts of post-armistice non-combat US fatalities by regular infantry regiment ######
noncombat_deaths_unit_data |>
  ggplot(aes(periodct, noncombat_dead)) + 
  geom_point(aes(color=type, alpha=type)) + theme_bw() + 
  scale_color_manual("Regiment", values=c("red4","dodgerblue1")) + 
  ylab("Unit Fatalities") + xlab("Period (Year-Month-Half)") + 
  scale_x_continuous(breaks=seq(31, 55, 14), labels=datelabels_post, minor_breaks = seq(31, 55, 1)) + 
  geom_line(data=noncombat_deaths_avg_data, aes(periodct, noncombat_dead, color=type), linewidth=1) + coord_cartesian(xlim=c(32, 54)) +
  facet_grid(cols = vars(type)) +
  scale_alpha_manual(values=c(0.3, 0.08)) +
  theme(legend.position = "none")
ggsave("./figures/FigA4.pdf", width=5, height=3.75)


##### Appendix 4: Robustness Tests #####

###### Appendix 4.1: Unit Size Variation ######

### Robustness of pre-armistice fatalities by race 
## Baseline Table of Organization and Equipment (TO&E) size for regiment is 3,700
toe = 3700
data_core_np$toe = toe
data_core_np$dead_spec_rate = data_core_np$dead_spec/data_core_np$toe*100

# Create race-specific TO&E levels going up/down in 5 percent increments (i.e., a 10 percent difference overall)
increment = toe*0.05

# Make adjustments so that black units have fewer people, as this would have to be bias for it to really be null
data_core_np = data_core_np |>
  mutate(toew5 = NA,
         toew10 = NA,
         toew15 = NA, 
         toew20 = NA)

data_core_np$toew5[data_core_np$black==0] = toe+increment*1
data_core_np$toew5[data_core_np$black==1] = toe-increment*1
data_core_np$dead_spec_rate_w5 = data_core_np$dead_spec/data_core_np$toew5*100

data_core_np$toew10[data_core_np$black==0] = toe+increment*2
data_core_np$toew10[data_core_np$black==1] = toe-increment*2
data_core_np$dead_spec_rate_w10 = data_core_np$dead_spec/data_core_np$toew10*100

data_core_np$toew15[data_core_np$black==0] = toe+increment*3
data_core_np$toew15[data_core_np$black==1] = toe-increment*3
data_core_np$dead_spec_rate_w15 = data_core_np$dead_spec/data_core_np$toew15*100

data_core_np$toew20[data_core_np$black==0] = toe+increment*4
data_core_np$toew20[data_core_np$black==1] = toe-increment*4
data_core_np$dead_spec_rate_w20 = data_core_np$dead_spec/data_core_np$toew20*100

# Run the models with period fixed effects 
rate = lm(dead_spec_rate ~ black + as.factor(period), data=data_core_np)
ratew5 = lm(dead_spec_rate_w5 ~ black + as.factor(period), data=data_core_np)
ratew10 = lm(dead_spec_rate_w10 ~ black + as.factor(period), data=data_core_np)
ratew15 = lm(dead_spec_rate_w15 ~ black + as.factor(period), data=data_core_np)
ratew20 = lm(dead_spec_rate_w20 ~ black + as.factor(period), data=data_core_np)

###### Table A2: Unit Size Variation - Pre-Armistice Fatalities by Race ######
stargazer(rate, ratew5, ratew10, ratew15, ratew20, 
          align=F, style="apsr", digits=2, 
          covariate.labels=c("Black Regiment"),
          omit=c("period"))
 

### Robustness of post-armistice non-combat fatalities by race 
## Baseline Table of Organization and Equipment (TO&E) size for regiment is 3,700
toe = 3700
data_post_np$toe = toe
data_post_np$noncombat_death_rate = data_post_np$noncombat_death/data_post_np$toe*100

# Create race-specific TO&E levels going up/down in 5 percent increments (i.e., a 10 percent difference overall)
increment = toe*0.05

# Make adjustments so that black units have fewer people, as this would have to be bias for it to really be null
data_post_np = data_post_np |>
  mutate(toew5 = NA,
         toew10 = NA,
         toew15 = NA, 
         toew20 = NA)

data_post_np$toeb5[data_post_np$black==1] = toe+increment*1
data_post_np$toeb5[data_post_np$black==0] = toe-increment*1
data_post_np$noncombat_death_rate_b5 = data_post_np$noncombat_death/data_post_np$toeb5*100

data_post_np$toeb10[data_post_np$black==1] = toe+increment*2
data_post_np$toeb10[data_post_np$black==0] = toe-increment*2
data_post_np$noncombat_death_rate_b10 = data_post_np$noncombat_death/data_post_np$toeb10*100

data_post_np$toeb15[data_post_np$black==1] = toe+increment*3
data_post_np$toeb15[data_post_np$black==0] = toe-increment*3
data_post_np$noncombat_death_rate_b15 = data_post_np$noncombat_death/data_post_np$toeb15*100

data_post_np$toeb20[data_post_np$black==1] = toe+increment*4
data_post_np$toeb20[data_post_np$black==0] = toe-increment*4
data_post_np$noncombat_death_rate_b20 = data_post_np$noncombat_death/data_post_np$toeb20*100


# Run the models with period fixed effects
ncr = lm(noncombat_death_rate ~ black + as.factor(period), data=data_post_np)
ncr5 = lm(noncombat_death_rate_b5 ~ black + as.factor(period), data=data_post_np)
ncr10 = lm(noncombat_death_rate_b10 ~ black + as.factor(period), data=data_post_np)
ncr15 = lm(noncombat_death_rate_b15 ~ black + as.factor(period), data=data_post_np)
ncr20 = lm(noncombat_death_rate_b20 ~ black + as.factor(period), data=data_post_np)

###### Table A3: Unit Size Variation - Post-Armistice Non-Combat Fatalities by Race ######
stargazer(ncr, ncr5, ncr10, ncr15, ncr20, 
          align=F, style="apsr", digits=2,
          covariate.labels=c("Black Regiment"),
          omit=c("period"))


###### Appendix 4.2: Count Models #####

# Negative binomial models for pre-armistice fatalities 
regmainnb  =  glm.nb(dead_spec ~ black, data=data_core_np)
regmainFEnb  =  glm.nb(dead_spec ~ black + as.factor(period), data=data_core_np)

#### Table A4: Negative Binomial - Pre-Armistice Fatalities by Race ####
stargazer(regmainnb, regmainFEnb,
          align=F, style="apsr",
          omit.stat=c("f","ser","adj.rsq","rsq"),
          digits=2,
          covariate.labels=c("Black Regiment"),
          omit=c("period"))

# Negative binomial models for post-armistice non-combat fatalities
regpostncnb = glm.nb(noncombat_death ~ black, data=data_post_np)
regpostFEncnb = glm.nb(noncombat_death ~ black + as.factor(period), data=data_post_np)

###### Table A5: Negative Binomial - Post-Armistice Non-Combat Fatalities by Race ######
stargazer(regpostncnb, regpostFEncnb,
          align=F, style="apsr",
          omit.stat=c("f","ser","adj.rsq","rsq"),
          digits=2,
          covariate.labels=c("Black Regiment"),
          omit=c("period"))


###### Appendix 4.3: Focusing on Units under AEF Command ######

# Create dummy variable for which units were under French (not AEF) command
data_core_np$frenchsimple = 0
data_core_np$frenchsimple[(data_core_np$unit=="369 Inf"|data_core_np$unit=="370 Inf"|data_core_np$unit=="371 Inf"|data_core_np$unit=="372 Inf")] = 1

# Regressions excluding observations of units under French command 
regmainfr = lm(dead_spec ~ black, data=data_core_np[data_core_np$frenchsimple==0,])
regmainFEfr = lm(dead_spec ~ black + as.factor(period), data=data_core_np[data_core_np$frenchsimple==0,])


###### Table A6: Removing Black units under French command - Pre-Armistice Fatalities by Race ######
stargazer(regmainfr, regmainFEfr,
          align=F, style="apsr",
          omit.stat=c("f","ser","adj.rsq","rsq"),
          digits=2,
          covariate.labels=c("Black Regiment"),
          omit=c("period"))


##### Appendix 4.4: Inclusion of Different Race Fatalities #####

# Regressions using deaths of all races in a regiment as outcome
regmainall = lm(dead_total ~ black, data=data_core_np)
regmainFEall = lm(dead_total ~ black + as.factor(period), data=data_core_np)


###### Table A7: Including All Deaths - Pre-Armistice Fatalities by Race ######
stargazer(regmainall, regmainFEall,
          align=F, style="apsr",
          omit.stat=c("f","ser","adj.rsq","rsq"),
          digits=2,
          covariate.labels=c("Black Regiment"),
          omit=c("period"))